INSTRUCTIONS
Imports and function code have been moved to the bottom of the page. Please locate the "RUN FIRST" heading toward the bottom of the page and run the cells from that point to the end of the notebook first. The following link will take you to the "RUN FIRST" heading.
Introduction/Business Problem
A marketing company specializing in producing and mailing direct mail marketing pieces for their clients, independent small businesses like hair salons and restaurants in Los Angeles county California, is looking to expand into Ventura and Santa Barbara counties. With years of experience and by tracking the redemption of coupon codes, the leadership at the marketing company has a good understanding of what neighborhoods are best suited to receive the direct mail marketing pieces.
The problem is a lack of similar knowledge of neighborhoods in Ventura and Santa Barbara counties. Targeting the correct neighborhoods is important since unlike email marketing that has a very low cost, direct physical mail pieces are relatively expensive to produce and deliver.
The deliverables for this project are:
1. Find census-block-level areas in Santa Barbara and Ventura counties that are most like the known target areas in Los Angeles. These are the areas that will be mailed the marketing material by the marketing company on behalf of their small-business clients.
2. Find businesses by category that are within 5 kilometers of the target areas. These are the marketing company’s potential clients.
**US CENSUS DATA**
| Description | Column Name | Supporting Data from US Census |
|---|---|---|
| SEX / AGE | ||
| Male 50 and older | P_M5069E | B01001_016E + B01001_017E + B01001_018E + B01001_019E + B01001_020E + B01001_021E +B01001_022E + B01001_023E + B01001_024E + B01001_025E |
| Female 50 and older | P_F5069E | B01001_040E + B01001_041E + B01001_042E + B01001_043E + B01001_044E + B01001_045E + B01001_046E + B01001_047E + B01001_048E + B01001_049E |
| HOUSEHOLD INCOME | ||
| 100k or more | P_HI100E | B19001_014E + B19001_015E + B19001_016E + B19001_017E |
| FAMILY TYPE | ||
| No related persons under 18 | P_NCE | B11004_007E |
| EDUCATIONAL ATTAINMENT | ||
| Bachelors Degree or higher | P_BACH | B15003_022E + B15003_023E + B15003_024E + B15003_025E |
# Census blocks targeted for mailers in LA, provided by the marketing company.
# Use the US Census info above to create a similar file for Ventura and Santa Barbara.
df_model = pd.read_json("https://jordan-arthur.github.io/datascicapstone/LA_targets.json")
df_model.head(5)
df_model["geoid"] = pd.to_numeric(df_model["geoid"])
# Compare mean feature values for excluded (target=0) vs included (target=1) blocks.
tempdf = df_model.copy()
tempdf.columns = ('geoid','Male50+','Female50+','Income','NoKids','Education','ngroup','target')
# Select groupby columns with a list -- passing a bare tuple is deprecated
# and removed in modern pandas.
tempdf = tempdf.groupby(["target"])[['Male50+','Female50+','Income','NoKids','Education']].mean().T.reset_index()
tempdf.columns = ('target','exclude','include')
# Convert fractions to percentages for plotting.
tempdf['exclude'] = tempdf['exclude']*100
tempdf['include'] = tempdf['include']*100
tempdf.head()
# Plot the mean feature percentages for excluded vs included LA blocks on a
# shared axis so the two profiles can be compared line-by-line.
ax = plt.gca()
plt.title('Include compared to Exclude')
tempdf.plot(kind='line',x='target',y='exclude',ax=ax)
tempdf.plot(kind='line',x='target',y='include', color='red', ax=ax)
plt.ylabel('Percent to total')
plt.xlabel('Los Angeles')
plt.show()
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
# Build the feature matrix / label vector and fit a logistic-regression
# classifier on the LA blocks the marketing company already targets.
X = np.asarray(df_model[['P_M50E','P_F50E','P_HI100E','P_NCE','P_BACH']])
X = np.nan_to_num(X)  # blocks with a zero denominator yield NaN percentages
y = np.asarray(df_model['target'])
# Standardize features so no single percentage dominates the fit.
X = preprocessing.StandardScaler().fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
LR
yhat_prob = LR.predict_proba(X_test)
yhat_prob
yhat = LR.predict(X_test)
yhat
# Jaccard similarity of predicted vs actual labels on the hold-out split.
jaccard_score(y_test, yhat)
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['target=1','target=0'],normalize= False, title='Confusion matrix')
print (classification_report(y_test, yhat))
# Load the Los Angeles (county 037) block-group GeoJSON and join it to the
# census feature dataframe on geoid.
vgeojson = open("counties_037.json", "r")
dfve = pd.read_json(vgeojson)
dfve = json_normalize(dfve['features'])
dfve=dfve[['properties.GEOID','geometry.coordinates','properties.INTPTLAT','properties.INTPTLON']]
dfve= dfve.rename(columns={'properties.GEOID': 'geoid'})
dfve["geoid"] = pd.to_numeric(dfve["geoid"])
dfve = dfve.merge(df_model, left_on='geoid', right_on='geoid')
#Lon and Lat are reversed, use helper function to change order
# NOTE(review): revpoints mutates the ring in place; its return value is unused.
for index, row in dfve.iterrows():
    pts=row['geometry.coordinates'][0]
    p=revpoints(pts)
dfve.head()
# Find matching areas in Ventura and SB counties.
# FIPS county codes: Los Angeles 037, Ventura 111, Santa Barbara 083
minmatchv = 0.75    # minimum predicted probability to count as a target match
targetradius = 5000  # metres; radius used later for the venue search circles
county = '111'
dfCounty = prepareCensusData(pd.read_json(addcounty(county)))
county = '083'
dfCounty = pd.concat([dfCounty, prepareCensusData(pd.read_json(addcounty(county)))
                      ], ignore_index=True).drop_duplicates().reset_index(drop=True)
dfCounty.shape
# Score every Ventura/SB block group with the LA-trained model.
Xv = np.asarray(dfCounty[['P_M50E','P_F50E','P_HI100E','P_NCE','P_BACH']])
Xv = np.nan_to_num(Xv)
Xv = preprocessing.StandardScaler().fit(Xv).transform(Xv)
Xv
ynew = LR.predict_proba(Xv)
dff = pd.DataFrame(data=ynew)
dfCounty['target'] = dff[1]  # probability of class 1 (target)
dfCounty["minmatch"] = np.where((dfCounty["target"] >= minmatchv), 1, 0)
dfCounty["geoid"] = pd.to_numeric(dfCounty["geoid"])
# Compare mean feature values for below-threshold vs above-threshold blocks.
tempdfc = dfCounty.copy()
tempdfc.columns = ('geoid','Male50+','Female50+','Income','NoKids','Education','ngroup','target')
# Select groupby columns with a list -- passing a bare tuple is deprecated
# and removed in modern pandas.
tempdfc = tempdfc.groupby(["target"])[['Male50+','Female50+','Income','NoKids','Education']].mean().T.reset_index()
tempdfc.columns = ('target','exclude','include')
tempdfc['exclude'] = tempdfc['exclude']*100
tempdfc['include'] = tempdfc['include']*100
tempdfc.head()
ax = plt.gca()
plt.title('Include compared to Exclude')
tempdfc.plot(kind='line',x='target',y='exclude',ax=ax)
tempdfc.plot(kind='line',x='target',y='include', color='red', ax=ax)
plt.ylabel('Percent to total')
plt.xlabel('Santa Barbara and Ventura')
plt.show()
# Median feature values per match group (list selection, not tuple).
dfCounty.groupby(["minmatch"])[["P_M50E","P_F50E","P_HI100E","P_NCE","P_BACH"]].median()
#First we need to get the json files for these counties and join them
county='111'
dfCounty_geoA = pd.read_json(f"https://jordan-arthur.github.io/datascicapstone/counties_{county}.json")
county='083'
dfCounty_geoB = pd.read_json(f"https://jordan-arthur.github.io/datascicapstone/counties_{county}.json")
frames = [dfCounty_geoA, dfCounty_geoB]
dfCounty_geo = pd.concat(frames)
# Flatten the GeoJSON 'features' records into columns.
dfCounty_geo = json_normalize(dfCounty_geo['features'])
dfCounty_geo.head(2)
#We don't need all the columns so lets trim the dataframe down
dfCounty_geo=dfCounty_geo[['properties.GEOID','geometry.coordinates','properties.INTPTLAT','properties.INTPTLON']]
#rename and change the datatype of the 'properties.GEOID' column to match our census dataframe
dfCounty_geo= dfCounty_geo.rename(columns={'properties.GEOID': 'geoid'})
dfCounty_geo["geoid"] = pd.to_numeric(dfCounty_geo["geoid"])
#Join the geography data to the census block data
dfCounty_geo = dfCounty_geo.merge(dfCounty, left_on='geoid', right_on='geoid')
dfCounty_geo.head(2)
#The lon and lat are not in the order we expect, lets reverse them
# NOTE(review): revpoints mutates the ring in place; its return value is unused.
for index, row in dfCounty_geo.iterrows():
    pts=row['geometry.coordinates'][0]
    p=revpoints(pts)
dfCounty_geo.head(2)
# Attach a padded bounding-box string ('recbox') to every census block.
rv = add_bboxrec(dfCounty_geo)
dfCounty_geo.head(2)
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Build the Ventura / Santa Barbara map: shade census blocks by match
# probability, then overlay bounding boxes and 5 km circles on the blocks
# at or above the match threshold.
m2 = folium.Map([34.3876, -119.2346],tiles="cartodbpositron", zoom_start=10,control_scale = True)
l=addlayer10("0 to 25", dfCounty_geo,'target',-0.01 ,0.25,m2,False)
l=addlayer10("25 to 50", dfCounty_geo,'target',0.25 ,0.50,m2,False)
l=addlayer10("50 to 75", dfCounty_geo,'target',0.50 ,0.75,m2,False)
l=addlayer10("75 or more", dfCounty_geo,'target',0.75 ,1,m2,True)
dft=dfCounty_geo[dfCounty_geo['target'] >= minmatchv]
feature_group = FeatureGroup(name="bounding box", show=1)
feature_group2 = FeatureGroup(name="5k radius", show=1)
for index, row in dft.iterrows():
    # 'recbox' holds the repr of [(min corner), (max corner)] -- parse it back.
    l=ast.literal_eval(row['recbox'])
    feature_group.add_child(folium.Rectangle(l, color="red", weight=1, opacity=1))
    # feature_group.add_child(folium.PolyLine(l, color="red", weight=1, opacity=1))
    feature_group2.add_child(
        folium.Circle(
            radius=targetradius,
            location=[float(row['properties.INTPTLAT']), float(row['properties.INTPTLON'])],
            color='blue',
            fill=False,
            weight=1,
        ))
feature_group2.add_to(m2)
feature_group.add_to(m2)
folium.LayerControl( collapsed=False,).add_to(m2)
m2
# Query Foursquare for venues near each high-match block: once for the
# block's bounding box and once for a 5 km radius around its internal point.
dft=dfCounty_geo[dfCounty_geo['target'] >= minmatchv]
venu_data = []
for index, row in dft.iterrows():
    l=ast.literal_eval(row['recbox'])
    geoid=row['geoid']
    # Bounding-box query (sw/ne corners).
    location=f'sw={l[0][0]},{l[0][1]}&ne={l[1][0]},{l[1][1]}'
    venu_data.append(getvenues(location,geoid))
    # Radius query around the block's internal point.
    location=f"ll={row['properties.INTPTLAT']},{row['properties.INTPTLON']}&radius=5000"
    venu_data.append(getvenues(location,geoid))
venu_data = pd.concat(venu_data)
# The two query styles overlap, so drop duplicate venues.
venu_data = venu_data.drop_duplicates().reset_index(drop=True)
#appended_data = appended_data.reset_index(drop=True)
# Plot each venue as a small green dot on the existing map.
for index, row in venu_data.iterrows():
    folium.Circle(
        radius=5,
        location=[float(row['lat']), float(row['lng'])],
        color='green',
        fill=False,
        weight=2,
    ).add_to(m2)
m2
# SAVE MAP to HTML file
#html_string = m2.get_root().render()
#with open("map_capstone.html", "w") as text_file:
#    text_file.write(html_string)
from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:80% !important; } td { text-align:left !important; } </style>"))
# Left-align table cells and headers in the notebook's rendered output.
display(HTML("<style> td { text-align:left !important; } th { text-align:left !important; } </style>"))
!pip install pyshp
!pip install folium
import pandas as pd
from pandas.io.json import json_normalize
import shapefile
import json
from json import dumps
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import FeatureGroup, LayerControl, Map, Marker, Popup
from sklearn.metrics import jaccard_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import itertools
import ast
#MAPPING FUNCTIONS
def getFillColor(value):
    """Map a 0..1 match score to a red-scale hex colour string.

    A score of 0 yields white ('#ffffff'); 1 yields pure red ('#ff0000').
    Scores above 1 clamp to pure red.
    """
    # Red stays saturated; green and blue fade out together as the score rises.
    fade = int(max(0, 255 * (1 - value)))
    return '#%02x%02x%02x' % (255, fade, fade)
def addlayer10(layername, layerdf, testvalue, minval, maxval, themap, showlayer):
    """Add one folium layer of census-block polygons whose *testvalue* score
    lies in the half-open interval (minval, maxval].

    Each polygon is filled by getFillColor and carries a popup listing the
    block's geoid, match percent, education, income, and no-children stats.
    Always returns 1.
    """
    in_band = (layerdf[testvalue] > minval) & (layerdf[testvalue] <= maxval)
    subset = layerdf[in_band]
    layer = FeatureGroup(name=layername, show=showlayer)
    for _, block in subset.iterrows():
        lat = float(block['properties.INTPTLAT'])
        lon = float(block['properties.INTPTLON'])
        ring = block['geometry.coordinates'][0]
        score_line = 'Match Percent: {:6.1f}%'.format(block[testvalue]*100)
        income_line = 'HHI 100+: {:6.1f}%'.format(block['P_HI100E']*100)
        degree_line = 'Education: {:6.1f}%'.format(block['P_BACH']*100)
        childfree_line = 'No Children at Home: {:6.1f}%'.format(block['P_NCE']*100)
        popup_html = f"<ul><li>{block['geoid']}</li><li>{score_line}</li><li>{degree_line}</li><li>{income_line}</li><li>{childfree_line}</li><li>{lat}, {lon}</li></ul>"
        layer.add_child(folium.vector_layers.Polygon(
            locations=ring,
            fill_opacity=0.5,
            opacity=1,
            weight=1,
            color='#efefef',
            fill_color=getFillColor(block[testvalue]),
            popup=Popup(html=popup_html)))
    layer.add_to(themap)
    return 1
#FUNCTIONS TO IMPORT AND PREPARE US CENSUS DATA
# API key for api.census.gov requests.
uscensuskey = "ec05fa5dc1b361505043c40ebcd446a6ace47778"
def addcounty(county):
    """Build the 2017 ACS 5-year block-group query URL for one CA county.

    county: 3-digit FIPS county code as a string (e.g. '111' for Ventura).
    The query requests family-type, household-income, and educational-
    attainment columns plus the whole B01001 (sex by age) group.
    """
    children = "B11004_001E,B11004_007E"
    income_cols = "B19001_001E,B19001_014E,B19001_015E,B19001_016E,B19001_017E"
    education = "B15003_001E,B15003_022E,B15003_023E,B15003_024E,B15003_025E"
    age_sex = 'group(B01001)'
    get_clause = ",".join((children, income_cols, education, age_sex))
    return (
        "https://api.census.gov/data/2017/acs/acs5"
        f"?get={get_clause}"
        "&for=block%20group:*&in=state:06"
        f"&in=county:{county}&in=tract:*&key={uscensuskey}"
    )
def prepareCensusData(dfin):
    """Turn a raw Census API response dataframe into percent-to-total features.

    dfin: dataframe whose first row is the API header row (column names).
    Returns a dataframe with a numeric 'geoid' plus P_* columns, each holding
    a measure as a fraction of its relevant universe total (e.g. P_BACH is
    bachelor's-or-higher counts divided by the B15003 total).
    """
    df2=dfin.copy()
    #use first row of data for the column names
    df2.columns = df2.iloc[0]
    df2.drop([df2.index[0]], inplace=True)
    #add geoid column to join to geojson block group
    df2["geoid"] = df2["state"] + df2["county"] + df2["tract"] + df2["block group"]
    #limit columns to just the geoid and the data values
    df2=df2.filter(regex='(geoid)|([0-9]E$)',axis=1)
    c = df2.filter(regex='[0-9]E$',axis=1).columns
    df2[c]=df2[c].astype(int)
    #Male age 50+
    # NOTE(review): the notebook's census table lists B01001_016E.._025E for
    # "Male 50 and older", but only _016E.._021E are summed here (and
    # _040E.._045E for females below). Confirm whether the older age
    # brackets were excluded on purpose to stay consistent with how the
    # LA_targets training data was built.
    df2["P_M50E"]=df2["B01001_016E"]+df2["B01001_017E"]+df2["B01001_018E"]+df2["B01001_019E"]+df2["B01001_020E"]+df2["B01001_021E"]
    #Female age 50+
    df2["P_F50E"]=df2["B01001_040E"]+df2["B01001_041E"]+df2["B01001_042E"]+df2["B01001_043E"]+df2["B01001_044E"]+df2["B01001_045E"]
    #HH Income 100 to 200+
    df2["P_HI100E"]=df2["B19001_014E"]+df2["B19001_015E"]+df2["B19001_016E"]+df2["B19001_017E"]
    #No children living at home
    df2["P_NCE"]=df2["B11004_007E"]
    #Education level, bachelor's degree or better
    df2["P_BACH"]=df2["B15003_022E"]+df2["B15003_023E"]+df2["B15003_024E"]+df2["B15003_025E"]
    #Create percents to total
    c = df2.filter(regex='^P_',axis=1).columns
    df2[c]=df2[c].astype(float)
    df2["P_M50E"]=df2["P_M50E"]/df2["B01001_001E"]
    df2["P_F50E"]=df2["P_F50E"]/df2["B01001_001E"]
    df2["P_HI100E"]=df2["P_HI100E"]/df2["B19001_001E"]
    df2["P_NCE"]=df2["P_NCE"]/df2["B11004_001E"]
    df2["P_BACH"]=df2["P_BACH"]/df2["B15003_001E"]
    df2=df2.filter(regex='(geoid)|(ngroup)|(^P_)',axis=1)
    df2["geoid"] = pd.to_numeric(df2["geoid"])
    return df2
#DATA FUNCTIONS
#ADD A BOUNDING BOX TO CENSUS BLOCKS
def add_bboxrec(dfC):
    """Add a 'recbox' column to dfC holding a padded bounding box per row.

    For each row, the outer ring of 'geometry.coordinates' is reduced to a
    [(min corner), (max corner)] pair padded by 0.005 degrees on every side
    and stored as its string repr (parse back with ast.literal_eval).
    Rows whose geometry cannot be processed get the string '[]'.

    Mutates dfC in place; returns the index of the last row visited
    (raises NameError on an empty frame, matching the original behaviour).
    """
    for j, valj in dfC.iterrows():
        gid = valj['geoid']
        points = []
        try:
            # Building the frame inside the try means malformed or missing
            # geometry falls through to the '[]' fallback instead of raising.
            ff = pd.DataFrame(valj['geometry.coordinates'][0])
            min0 = ff[0].min() - 0.005
            max0 = ff[0].max() + 0.005
            min1 = ff[1].min() - 0.005
            max1 = ff[1].max() + 0.005
        except Exception:
            # Bad geometry: leave points empty -> recbox '[]'.
            pass
        else:
            points.append((min0, min1))
            points.append((max0, max1))
        dfC.loc[dfC['geoid'] == gid, 'recbox'] = f'{points}'
    return j
def revpoints(p):
    """Swap each [lon, lat] pair in p to [lat, lon], in place.

    Fixes the original implementation, which iterated the global name
    ``pts`` instead of its parameter and therefore only worked by accident
    when the caller happened to use a global named ``pts``.

    Returns p for convenience (the list is mutated in place).
    """
    for point in p:
        point[0], point[1] = point[1], point[0]
    return p
import requests
# NOTE(review): API credentials are hardcoded in the notebook; they should be
# rotated and loaded from an environment variable or config file instead.
CLIENT_ID = 'SYHKXVG2PH0EDA5SSLGIUXANZUFRXCZWOROQSYRHTUCQ15UE' # your Foursquare ID
CLIENT_SECRET = 'DOPPZHDPJ3F5RNQZLX5NVJ5XC3IZNYXHGUQNQ55H5BYV1JQA' # your Foursquare Secret
VERSION = '20170104' # Foursquare API version
def getvenues(location,geoid):
    """Fetch venues from the Foursquare 'explore' endpoint for one area.

    location: pre-formatted query fragment ('sw=..&ne=..' or 'll=..&radius=..').
    geoid: census block id stamped onto every returned row.
    Returns a dataframe of venues; when the response cannot be parsed, the
    empty fallback frame (plus 'geoid') is returned instead. The categoryId
    in the URL restricts results to a single Foursquare venue category.
    """
    url=f"https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}&{location}&intent=browse&categoryId=4bf58dd8d48988d110951735"
    # Empty frame defining the fallback result shape.
    data = {'name':[],
            'id':[],
            'lat':[],
            'lng':[],
            'address':[],
            'postalCode':[],
            'geoid':[]
            }
    reb=pd.DataFrame(data)
    re=pd.DataFrame(requests.get(url).json()["response"]['groups'][0]['items'])
    try:
        re2 = json_normalize(re['venue'])
        re2=re2[['name','id','location.lat','location.lng','location.address','location.postalCode']]
        re2.columns=('name','id','lat','lng','address','postalCode')
    except:
        # NOTE(review): bare except silently swallows any parsing error and
        # keeps the empty fallback frame -- apparently a deliberate
        # best-effort, but a narrower except (KeyError,) would be safer.
        err=1
    else:
        reb=re2
    reb['geoid']=geoid
    return reb
# As found in the class project
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Render *cm* as a colour-mapped image with a label in every cell.

    Setting ``normalize=True`` rescales each row of the matrix to sum to 1
    before printing and plotting. The matrix is also echoed to stdout.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)

    # Label each cell, flipping the text colour on the darker half of the map.
    cell_fmt = '.2f' if normalize else 'd'
    cutoff = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, format(cm[row, col], cell_fmt),
                 horizontalalignment="center",
                 color="white" if cm[row, col] > cutoff else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Echo the raw test-split confusion matrix with rows/cols ordered [1, 0].
print(confusion_matrix(y_test, yhat, labels=[1,0]))
Below is the code I ran on a local machine to convert the .shp file into a JSON file containing only the counties needed for this project.
# Run locally: extract the LA (037), Ventura (111), and Santa Barbara (083)
# block groups from the statewide TIGER shapefile and write them as GeoJSON.
reader = shapefile.Reader("d:/tl_2019_06_bg/tl_2019_06_bg.shp")
fields = reader.fields[1:]
field_names = [field[0] for field in fields]
buffer = []
for sr in reader.shapeRecords():
    if sr.record[1] in ("111", "083", "037"):
        atr = dict(zip(field_names, sr.record))
        # The markdown rendering stripped the dunder underscores -- the
        # pyshp attribute is __geo_interface__, not geo_interface.
        geom = sr.shape.__geo_interface__
        buffer.append(dict(type="Feature", geometry=geom, properties=atr))
with open("counties.json", "w") as geojson:
    geojson.write(dumps({"type": "FeatureCollection", "features": buffer}, indent=2) + "\n")
</code>